1. Load library

library(data.table)
library(ggthemes)
library(tidyverse)
library(GGally)
library(plotly)
library(plotly)
library(mde)
library(corrplot)
library(viridis)
library(caTools)
theme_set(theme_few())

2. Load Data

salary <- as_tibble(fread('NBA_season1718_salary.csv'))
ss <- as_tibble(fread('Seasons_Stats.csv'))

3. Data Overview

3.1 salary

head(salary)
## # A tibble: 6 × 4
##      V1 Player         Tm    season17_18
##   <int> <chr>          <chr>       <dbl>
## 1     1 Stephen Curry  GSW      34682550
## 2     2 LeBron James   CLE      33285709
## 3     3 Paul Millsap   DEN      31269231
## 4     4 Gordon Hayward BOS      29727900
## 5     5 Blake Griffin  DET      29512900
## 6     6 Kyle Lowry     TOR      28703704
summary(salary)
##        V1         Player               Tm             season17_18      
##  Min.   :  1   Length:573         Length:573         Min.   :   17224  
##  1st Qu.:144   Class :character   Class :character   1st Qu.: 1312611  
##  Median :287   Mode  :character   Mode  :character   Median : 2386864  
##  Mean   :287                                         Mean   : 5858946  
##  3rd Qu.:430                                         3rd Qu.: 7936509  
##  Max.   :573                                         Max.   :34682550

After viewing the head and summary, we have noticed several things a).colname “season17_18” should be changed to “salary”; b).there is no null values in the salary table; c).this table is already ranked based off salary.

3.2 season statistics

head(ss)
## # A tibble: 6 × 53
##      V1  Year Player      Pos     Age Tm        G    GS    MP   PER `TS%` `3PAr`
##   <int> <int> <chr>       <chr> <int> <chr> <int> <int> <int> <dbl> <dbl>  <dbl>
## 1     0  1950 Curly Arms… G-F      31 FTW      63    NA    NA    NA 0.368     NA
## 2     1  1950 Cliff Bark… SG       29 INO      49    NA    NA    NA 0.435     NA
## 3     2  1950 Leo Barnho… SF       25 CHS      67    NA    NA    NA 0.394     NA
## 4     3  1950 Ed Bartels  F        24 TOT      15    NA    NA    NA 0.312     NA
## 5     4  1950 Ed Bartels  F        24 DNN      13    NA    NA    NA 0.308     NA
## 6     5  1950 Ed Bartels  F        24 NYK       2    NA    NA    NA 0.376     NA
## # … with 41 more variables: FTr <dbl>, `ORB%` <dbl>, `DRB%` <dbl>,
## #   `TRB%` <dbl>, `AST%` <dbl>, `STL%` <dbl>, `BLK%` <dbl>, `TOV%` <dbl>,
## #   `USG%` <dbl>, blanl <lgl>, OWS <dbl>, DWS <dbl>, WS <dbl>, `WS/48` <dbl>,
## #   blank2 <lgl>, OBPM <dbl>, DBPM <dbl>, BPM <dbl>, VORP <dbl>, FG <int>,
## #   FGA <int>, `FG%` <dbl>, `3P` <int>, `3PA` <int>, `3P%` <dbl>, `2P` <int>,
## #   `2PA` <int>, `2P%` <dbl>, `eFG%` <dbl>, FT <int>, FTA <int>, `FT%` <dbl>,
## #   ORB <int>, DRB <int>, TRB <int>, AST <int>, STL <int>, BLK <int>, …
# summary(ss)

After viewing the head and summary, we have noticed several things a).this table contains lots of columns as well as NA values. b). stat data ranges from 1950 to …. c). there were players who played for different teams within a season d). completely empty columns: blani/blank2 which should be dropped

4. Data Cleaning

4.1 Data merge and select

  • we want to just use the most recent data 2016-2017 season
  • we will merge salary and stats17
  • we will get rid of empty columns and keep only columns that ‘I’ understand :) as a non basketball analytic expert
ss17 <- ss %>% filter(Year==2017)
df <- merge(ss17, salary, by.x='Player',by.y = 'Player')
df <- df %>% select(
   Player, 
   Age,
   Team = Tm.y,
   ORB,DRB,AST,STL,BLK,TOV,PTS,
   G,GS,MP,Eff= PER,"TS%",FG,FGA,'FG%',"3P","3P%","2P","2PA","2P%",
   Salary=season17_18)

4.2 Null

#null summary
na_summary(df,sort_by = "percent_complete") 
##    variable missing complete percent_complete percent_missing
## 5       3P%      33      519         94.02174       5.9782609
## 2       2P%       3      549         99.45652       0.5434783
## 12      FG%       1      551         99.81884       0.1811594
## 24      TS%       1      551         99.81884       0.1811594
## 1        2P       0      552        100.00000       0.0000000
## 3       2PA       0      552        100.00000       0.0000000
## 4        3P       0      552        100.00000       0.0000000
## 6       Age       0      552        100.00000       0.0000000
## 7       AST       0      552        100.00000       0.0000000
## 8       BLK       0      552        100.00000       0.0000000
## 9       DRB       0      552        100.00000       0.0000000
## 10      Eff       0      552        100.00000       0.0000000
## 11       FG       0      552        100.00000       0.0000000
## 13      FGA       0      552        100.00000       0.0000000
## 14        G       0      552        100.00000       0.0000000
## 15       GS       0      552        100.00000       0.0000000
## 16       MP       0      552        100.00000       0.0000000
## 17      ORB       0      552        100.00000       0.0000000
## 18   Player       0      552        100.00000       0.0000000
## 19      PTS       0      552        100.00000       0.0000000
## 20   Salary       0      552        100.00000       0.0000000
## 21      STL       0      552        100.00000       0.0000000
## 22     Team       0      552        100.00000       0.0000000
## 23      TOV       0      552        100.00000       0.0000000
# we will just drop missing values because they weren't important 
df <- na.omit(df)

5. EDA

5.1 correlation

nu_df <- df %>% select(where(is.numeric))
corrplot(cor(nu_df),method='pie')

cor_salary <- as.data.frame(cor(nu_df)[,'Salary'])
names(cor_salary) <- 'Salary'
cor_salary %>% arrange(desc(Salary))
##            Salary
## Salary 1.00000000
## PTS    0.71886201
## FG     0.71647169
## FGA    0.69445520
## 2P     0.68189697
## 2PA    0.67212484
## GS     0.67033193
## MP     0.65208864
## DRB    0.63334474
## TOV    0.63074911
## STL    0.57254137
## AST    0.53944238
## Eff    0.52858248
## 3P     0.50025037
## ORB    0.45689948
## BLK    0.45180786
## G      0.42357945
## TS%    0.25298405
## Age    0.19570007
## FG%    0.18930759
## 2P%    0.07659004
## 3P%    0.05295982
  • we want to see all the selected variables’ correlations with the salary.
  • we can see that PTS,FG,FGA are strongly correlated with salaries -> which might indicate that the more you score, the more money you will earn.
  • it is quite surprising that 3P%, 2P%, TS% and FG% have little to do with salary
  • (we will verify this later in the machine learning part)

5.2 graphs

5.2.1 PTS VS Salary

since PTS is most correlated with salary, let’s look closely how the graph will look like

s_p <- ggplot(df,aes(Salary/1000000,PTS,color=Player)) + geom_point(alpha=0.5) + labs(x='Salary(million)', y='Total Points Scored', title='Salary vs Point')  +theme(plot.title = element_text(hjust=.5,size=14,face='bold'),legend.position = "none") + scale_y_continuous(breaks=seq(0,3000,500))+scale_x_continuous(breaks=seq(0,40,5)) 

ggplotly(s_p)

You can see that in general, pts and salary are positively correlated: if you scored more, you will get paid more.

5.2.2 Team Salary

The National Basketball Association today announced that the Salary Cap has been set at $99.093 million for the 2017-18 season.

df %>% group_by(Team) %>% summarize(ts = sum(Salary)) %>% 
  ggplot(aes(reorder(Team,ts),ts/1000000)) + geom_col(color='black',fill=' tomato') + coord_flip()+labs(x='Total Salary(million)',y=NULL,title='Total Salary per Team') + theme(plot.title = element_text(size=14, hjust=.5)) + geom_hline(yintercept = 99.093,linetype='dotted',color='blue',size=1.5)

We can see that nearly 2/3 of NBA teams’ salary are above the Salary Cap 99.093 million. It might be interesting to acquire team winning data and further compare them with total salary to see if they are correlated.

6. Machine Learning (Linear Regression)

6.1 Linear Regression

sample <- sample.split(df$Salary,.7)

train <- subset(df, sample==T)
test <- subset(df, sample==F)

6.2 Training Model

model <- lm(Salary~.,train[,c('Salary',"ORB","DRB","AST","STL","BLK","TOV","PTS")])
summary(model)
## 
## Call:
## lm(formula = Salary ~ ., data = train[, c("Salary", "ORB", "DRB", 
##     "AST", "STL", "BLK", "TOV", "PTS")])
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -13962098  -2568200   -182771   1875066  20374075 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -3130     431957  -0.007 0.994222    
## ORB            11191       9890   1.132 0.258583    
## DRB             9161       5055   1.812 0.070789 .  
## AST            19621       5010   3.917 0.000108 ***
## STL           -15780      14745  -1.070 0.285264    
## BLK            12400      14271   0.869 0.385519    
## TOV           -51110      15491  -3.299 0.001068 ** 
## PTS            11240       1385   8.113 8.25e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4850000 on 353 degrees of freedom
## Multiple R-squared:  0.5635, Adjusted R-squared:  0.5548 
## F-statistic: 65.09 on 7 and 353 DF,  p-value: < 2.2e-16
names(df)
##  [1] "Player" "Age"    "Team"   "ORB"    "DRB"    "AST"    "STL"    "BLK"   
##  [9] "TOV"    "PTS"    "G"      "GS"     "MP"     "Eff"    "TS%"    "FG"    
## [17] "FGA"    "FG%"    "3P"     "3P%"    "2P"     "2PA"    "2P%"    "Salary"

We only limited our predictors to ORB : PTS as these data are most readily available and we can acquire them online to predict a player’s next season salary. * From the summary table, we can see that PTS,TOV,BLK,AST are four most important factors when it comes to NBA salary prediction.

6.3 Residuals

res <- as.data.frame(residuals(model))
names(res) <- "residual"
ggplot(res,aes(residual)) + geom_histogram(fill='tomato',color='black')

The residual histogram is normally distributed

6.4 Test predictions

Salary.predictions <- predict(model,test)
results <- cbind(Salary.predictions,test$Salary)
colnames(results) <- c('pred','actual')
results <- as.data.frame(results)
results
##            pred   actual
## 1     595240.34  1312611
## 2    3081317.68  2116955
## 3   13276742.12  5504420
## 10   8157402.37  4187599
## 11    926476.30   778668
## 14   8182791.44 11000000
## 17  10797863.75 14814815
## 21   1864238.79  2328652
## 22   5003023.52  1312611
## 27  26476917.68 23775506
## 28   5847539.93  3290000
## 29   5847539.93  2000000
## 39   3556885.13    25000
## 49   2756381.83  5179760
## 52   8502511.91   119010
## 54   5454389.13 13618750
## 55   2540512.09   119602
## 59    693029.18    50000
## 70   3085413.09  3675480
## 71   8990210.11  3675480
## 74   3184058.41   333334
## 75   1347657.64  2203440
## 76   1679161.40  2203440
## 88    472998.97  1471382
## 89    510325.95  1471382
## 97   9370375.28  7630000
## 101  8929209.56  5562360
## 103 23931424.68 26153057
## 105 13996221.28 20559599
## 107  6544705.06  2300000
## 108  3824153.95  4992385
## 116   951661.72    26773
## 118   951661.72  1577230
## 119   125449.58    26773
## 121   125449.58  1577230
## 124  1080241.53  1577230
## 126   802827.88  1312611
## 127  1706091.48  1645200
## 128 21800833.78 27739975
## 134   208838.49    50000
## 135   208838.49    92858
## 136 13882812.40 15500000
## 139  8871708.83  5500000
## 140  6822717.66  5500000
## 141  6146172.21 12000000
## 143 12911638.83  2116955
## 144  3627577.09   104059
## 146  1783658.24   104059
## 148  5258046.47  4402546
## 154 10921201.80  5000000
## 161 14343746.54 16400000
## 164 11519975.14 15550000
## 165 11519975.14  2328652
## 170 11413788.37 20566802
## 173  3358311.48  6000000
## 175  9421863.37  6000000
## 178 12882788.52   502328
## 179    96353.22  6000000
## 182  7292043.94 17131148
## 187    77840.51    50000
## 195  5698821.27  1524305
## 200 11432182.25 17884176
## 201 17114108.59 23112004
## 209  1861426.09    53465
## 210   115586.40  1312611
## 211 23812483.95  6261395
## 214  3972526.34  1312611
## 221   654448.76  1312611
## 224  9116307.93 10942762
## 226  9758380.02  2262871
## 228  5081685.78  3028410
## 229 25335477.24 28299399
## 230 11068106.90 13954000
## 231  9603586.09  8533333
## 234   265900.42    17224
## 237   -22473.86  2328652
## 238  5323438.54  5225000
## 241  5343510.46  4956480
## 249  5875964.09 12000000
## 251   278902.68  9000000
## 261   354054.72  1312611
## 267 21316220.13 18063850
## 276  3009440.83   250000
## 290  3479089.97  1579440
## 291  6466363.98  1000000
## 296  7262002.62  2947305
## 298  1047914.99   100000
## 300 26847989.76  6216840
## 306  9119844.00 12921348
## 312  3994467.49 14100000
## 314  7042836.32  8393000
## 320  3477072.21  7000000
## 323  1007896.91  4666500
## 324 20623542.80 18868625
## 325 17218249.77 21461010
## 330  3593359.62  6655325
## 332  5715143.48  6666667
## 336  4435728.86   500000
## 338  9907133.44  7000000
## 341  4001701.10  2947305
## 342  4199568.25  1974159
## 353   179575.69  1312611
## 355  9808702.72  4538020
## 362 13037849.47 13168750
## 370  8991807.52 10162922
## 374   112615.95   500000
## 379  2338868.53  1662500
## 380  6327230.16  5000000
## 381   760698.36  1709538
## 390  5029057.62  3949999
## 393  3048472.01  4187599
## 395   685152.80  2328652
## 397 15595026.50 22434783
## 400 18765123.35  1471382
## 408    49883.71  2106470
## 411 15891865.32 24773250
## 412  2155163.67  7590035
## 421 12998521.46 16000000
## 422 16844546.93 19508958
## 426   118121.49  1709538
## 428  2831258.36  1709538
## 431   733592.79   100000
## 433   138629.40   100000
## 438  4001851.30   263124
## 440  1609903.96  1889040
## 442  1034921.16  1471382
## 444  2005501.44  2500000
## 445  8508105.35 16000000
## 446  5526283.17  2328652
## 452  8385729.61  2386864
## 457  5580350.44  8406000
## 459 29323623.24 28530608
## 464  8414394.20   172238
## 465  8414394.20   789725
## 467  3760161.22 20061729
## 468 11604431.65 20061729
## 472  5067424.38  7692308
## 473  3839538.78  6000000
## 477  4831926.64  6021175
## 484  8985640.85  1471382
## 485 11888223.91  3152931
## 487 10500517.90 14000000
## 488  8363995.72 14000000
## 490  2869074.78  2422560
## 492  8845499.87 10500000
## 500  4429550.54 15280000
## 502 16251509.77 16000000
## 505  8768795.39 15453126
## 518   663061.87  1471382
## 525  3172709.74  1709538
## 532 11135906.52 21000000
## 536  7872892.96  6270000
## 537   442621.26  1312611
## 546 13457617.24 12016854
## 548  5493246.82  1312611
## 550  9342471.70  3202217
SSE = sum((results$pred - results$actual)^2)
SST = sum( (mean(df$Salary) - results$actual)^2)

R2 = 1 - SSE/SST
R2
## [1] 0.504912

hm.. low R2 is not a good sign but let’s keep exploring

6.5 Predictions

*Let’s predict Stephen Curry(the best player in the world:)) salary of next season

curry_stas <- data.frame(name = 'Stephen Curry', PTS = 1999,AST = 524,BLK=17,TOV=239 )

n_model <- lm(Salary~., df[,c('Salary','PTS','AST','BLK','TOV')])
outcome <- predict(n_model,curry_stas)
print(paste('pred:', outcome, 'real: 45780000'))
## [1] "pred: 21643871.0410019 real: 45780000"